Step 1:
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE
cv_states <- merge(cv_states, state_pops, by= "state")
Step 2. Look at the data
- Inspect the dimensions,
head, and tail of the data
- Inspect the structure of each variables. Are they in the correct format?
dim(cv_states)
## [1] 52894 9
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 1 Alabama 2022-10-13 1 1528739 20505 1 4887871 96.50939 AL
## 2 Alabama 2021-07-08 1 552911 11387 1 4887871 96.50939 AL
## 3 Alabama 2021-09-24 1 784484 13921 1 4887871 96.50939 AL
## 4 Alabama 2022-03-16 1 1290165 18944 1 4887871 96.50939 AL
## 5 Alabama 2022-11-07 1 1534287 20558 1 4887871 96.50939 AL
## 6 Alabama 2020-08-02 1 91444 1627 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 52889 Wyoming 2021-05-24 56 59797 713 56 577737 5.950611 WY
## 52890 Wyoming 2022-11-16 56 179838 1924 56 577737 5.950611 WY
## 52891 Wyoming 2022-05-04 56 156745 1814 56 577737 5.950611 WY
## 52892 Wyoming 2021-07-17 56 63523 760 56 577737 5.950611 WY
## 52893 Wyoming 2022-04-27 56 156550 1812 56 577737 5.950611 WY
## 52894 Wyoming 2022-06-18 56 162160 1824 56 577737 5.950611 WY
str(cv_states)
## 'data.frame': 52894 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2022-10-13" "2021-07-08" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 1528739 552911 784484 1290165 1534287 91444 843161 836255 587405 1531305 ...
## $ deaths : int 20505 11387 13921 18944 20558 1627 16073 15834 11536 20533 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
3. Format the data
# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
### FINISH THE CODE HERE
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame': 52894 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 664 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 15 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 400 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 89 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 505 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 234 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 52121 Wyoming 2022-12-08 56 181695 1939 56 577737 5.950611 WY
## 52704 Wyoming 2022-12-09 56 181695 1939 56 577737 5.950611 WY
## 52108 Wyoming 2022-12-10 56 181695 1939 56 577737 5.950611 WY
## 52654 Wyoming 2022-12-11 56 181695 1939 56 577737 5.950611 WY
## 52467 Wyoming 2022-12-12 56 181695 1939 56 577737 5.950611 WY
## 52765 Wyoming 2022-12-13 56 182198 1951 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 664 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 15 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 400 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 89 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 505 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 234 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
## state date fips cases
## Washington : 1058 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 1055 1st Qu.:2020-11-11 1st Qu.:16.00 1st Qu.: 96570
## California : 1054 Median :2021-07-23 Median :29.00 Median : 357800
## Arizona : 1053 Mean :2021-07-22 Mean :29.78 Mean : 848487
## Massachusetts: 1047 3rd Qu.:2022-04-03 3rd Qu.:44.00 3rd Qu.: 1003619
## Wisconsin : 1043 Max. :2022-12-13 Max. :72.00 Max. :11630402
## (Other) :46584
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 1410 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
## Median : 5279 Median :29.00 Median : 4468402 Median : 107.860
## Mean : 11680 Mean :29.78 Mean : 6402373 Mean : 422.968
## 3rd Qu.: 14699 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :100252 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :1006
## abb
## WA : 1058
## IL : 1055
## CA : 1054
## AZ : 1053
## MA : 1047
## WI : 1043
## (Other):46584
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-12-13"
4. Add new_cases and new_deaths and correct outliers
# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
### FINISH THE CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$new_cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$new_deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
#p2=NULL
5. Add additional variables
### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))
6. Explore scatterplots using plot_ly()
### FINISH CODE HERE
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper100k
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") ,
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
hovermode = "compare")
7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()
### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point() + geom_smooth()
ggplotly(p)
8. Multiple line chart
### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>% add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")